Load required libraries

library(data.table)
library(ggplot2)
library(ggmosaic)
library(readr)
library(stringr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
## 
##     between, first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(arules)
## Loading required package: Matrix
## 
## Attaching package: 'arules'
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following objects are masked from 'package:base':
## 
##     abbreviate, write
library(arulesViz)

Loading the dataset

#install.packages('readxl')
library('readxl')
trans <- read_excel('QVI_transaction_data.xlsx')

Exploratory data analysis

The first step in any analysis is to first understand the data. Let’s take a look at each of the datasets provided.

Examine transaction data

#showing head ( top 10 rows)
head(trans)
## # A tibble: 6 × 8
##    DATE STORE_NBR LYLTY_CARD_NBR TXN_ID PROD_NBR PROD_NAME       PROD_…¹ TOT_S…²
##   <dbl>     <dbl>          <dbl>  <dbl>    <dbl> <chr>             <dbl>   <dbl>
## 1 43390         1           1000      1        5 Natural Chip  …       2     6  
## 2 43599         1           1307    348       66 CCs Nacho Chee…       3     6.3
## 3 43605         1           1343    383       61 Smiths Crinkle…       2     2.9
## 4 43329         2           2373    974       69 Smiths Chip Th…       5    15  
## 5 43330         2           2426   1038      108 Kettle Tortill…       3    13.8
## 6 43604         4           4074   2982       57 Old El Paso Sa…       1     5.1
## # … with abbreviated variable names ¹​PROD_QTY, ²​TOT_SALES
#showing summary
summary(trans)
##       DATE         STORE_NBR     LYLTY_CARD_NBR        TXN_ID       
##  Min.   :43282   Min.   :  1.0   Min.   :   1000   Min.   :      1  
##  1st Qu.:43373   1st Qu.: 70.0   1st Qu.:  70021   1st Qu.:  67602  
##  Median :43464   Median :130.0   Median : 130358   Median : 135138  
##  Mean   :43464   Mean   :135.1   Mean   : 135550   Mean   : 135158  
##  3rd Qu.:43555   3rd Qu.:203.0   3rd Qu.: 203094   3rd Qu.: 202701  
##  Max.   :43646   Max.   :272.0   Max.   :2373711   Max.   :2415841  
##     PROD_NBR       PROD_NAME            PROD_QTY         TOT_SALES      
##  Min.   :  1.00   Length:264836      Min.   :  1.000   Min.   :  1.500  
##  1st Qu.: 28.00   Class :character   1st Qu.:  2.000   1st Qu.:  5.400  
##  Median : 56.00   Mode  :character   Median :  2.000   Median :  7.400  
##  Mean   : 56.58                      Mean   :  1.907   Mean   :  7.304  
##  3rd Qu.: 85.00                      3rd Qu.:  2.000   3rd Qu.:  9.200  
##  Max.   :114.00                      Max.   :200.000   Max.   :650.000
#showing high level structure
str(trans)
## tibble [264,836 × 8] (S3: tbl_df/tbl/data.frame)
##  $ DATE          : num [1:264836] 43390 43599 43605 43329 43330 ...
##  $ STORE_NBR     : num [1:264836] 1 1 1 2 2 4 4 4 5 7 ...
##  $ LYLTY_CARD_NBR: num [1:264836] 1000 1307 1343 2373 2426 ...
##  $ TXN_ID        : num [1:264836] 1 348 383 974 1038 ...
##  $ PROD_NBR      : num [1:264836] 5 66 61 69 108 57 16 24 42 52 ...
##  $ PROD_NAME     : chr [1:264836] "Natural Chip        Compny SeaSalt175g" "CCs Nacho Cheese    175g" "Smiths Crinkle Cut  Chips Chicken 170g" "Smiths Chip Thinly  S/Cream&Onion 175g" ...
##  $ PROD_QTY      : num [1:264836] 2 3 2 5 3 1 1 1 1 2 ...
##  $ TOT_SALES     : num [1:264836] 6 6.3 2.9 15 13.8 5.1 5.7 3.6 3.9 7.2 ...

Convert DATE column to a date format

We can see that the DATE type is DOUBLE We need to convert it to DATE type CSV and Excel integer dates begin on 30 Dec 1899

typeof(trans$DATE)
## [1] "double"
trans$DATE <- as.Date(trans$DATE,origin = '1899-12-30')
typeof(trans$DATE)
## [1] "double"
#examine structure
str(trans)
## tibble [264,836 × 8] (S3: tbl_df/tbl/data.frame)
##  $ DATE          : Date[1:264836], format: "2018-10-17" "2019-05-14" ...
##  $ STORE_NBR     : num [1:264836] 1 1 1 2 2 4 4 4 5 7 ...
##  $ LYLTY_CARD_NBR: num [1:264836] 1000 1307 1343 2373 2426 ...
##  $ TXN_ID        : num [1:264836] 1 348 383 974 1038 ...
##  $ PROD_NBR      : num [1:264836] 5 66 61 69 108 57 16 24 42 52 ...
##  $ PROD_NAME     : chr [1:264836] "Natural Chip        Compny SeaSalt175g" "CCs Nacho Cheese    175g" "Smiths Crinkle Cut  Chips Chicken 170g" "Smiths Chip Thinly  S/Cream&Onion 175g" ...
##  $ PROD_QTY      : num [1:264836] 2 3 2 5 3 1 1 1 1 2 ...
##  $ TOT_SALES     : num [1:264836] 6 6.3 2.9 15 13.8 5.1 5.7 3.6 3.9 7.2 ...

Examine PROD_NAME

Since PROD_NAME is a name given to individual object, we will factorise it and make them into groups.

trans$PROD_NAME_FACTOR <- factor(trans$PROD_NAME)
summary(trans$PROD_NAME_FACTOR)
##   Kettle Mozzarella   Basil & Pesto 175g 
##                                     3304 
## Kettle Tortilla ChpsHny&Jlpno Chili 150g 
##                                     3296 
## Cobs Popd Swt/Chlli &Sr/Cream Chips 110g 
##                                     3269 
##   Tyrrells Crisps     Ched & Chives 165g 
##                                     3268 
##           Cobs Popd Sea Salt  Chips 110g 
##                                     3265 
##             Kettle 135g Swt Pot Sea Salt 
##                                     3257 
##            Tostitos Splash Of  Lime 175g 
##                                     3252 
## Infuzions Thai SweetChili PotatoMix 110g 
##                                     3242 
##   Smiths Crnkle Chip  Orgnl Big Bag 380g 
##                                     3233 
##     Thins Potato Chips  Hot & Spicy 175g 
##                                     3229 
## Kettle Sensations   Camembert & Fig 150g 
##                                     3219 
##  Doritos Corn Chips  Cheese Supreme 170g 
##                                     3217 
##                 Pringles Barbeque   134g 
##                                     3210 
##  Doritos Corn Chip Mexican Jalapeno 150g 
##                                     3204 
##  Kettle Sweet Chilli And Sour Cream 175g 
##                                     3200 
## Smiths Crinkle Chips Salt & Vinegar 330g 
##                                     3197 
##           Thins Chips Light&  Tangy 175g 
##                                     3188 
##         Dorito Corn Chp     Supreme 380g 
##                                     3185 
##             Pringles Sweet&Spcy BBQ 134g 
##                                     3177 
##  Infuzions BBQ Rib   Prawn Crackers 110g 
##                                     3174 
##  Tyrrells Crisps     Lightly Salted 165g 
##                                     3174 
##     Kettle Sea Salt     And Vinegar 175g 
##                                     3173 
##  Doritos Corn Chip Southern Chicken 150g 
##                                     3172 
##                     Twisties Chicken270g 
##                                     3170 
##          Twisties Cheese     Burger 250g 
##                                     3169 
##    Grain Waves         Sweet Chilli 210g 
##                                     3167 
##           Pringles SourCream  Onion 134g 
##                                     3162 
##    Doritos Corn Chips  Nacho Cheese 170g 
##                                     3160 
##   Cobs Popd Sour Crm  &Chives Chips 110g 
##                                     3159 
##                     Kettle Original 175g 
##                                     3159 
##          Pringles Original   Crisps 134g 
##                                     3157 
##                     Cheezels Cheese 330g 
##                                     3149 
##         Kettle Honey Soy    Chicken 175g 
##                                     3148 
##  Kettle Tortilla ChpsBtroot&Ricotta 150g 
##                                     3146 
##        Tostitos Smoked     Chipotle 175g 
##                                     3145 
##   Infzns Crn Crnchers Tangy Gcamole 110g 
##                                     3144 
##        Smiths Crinkle      Original 330g 
##                                     3142 
##     Kettle Tortilla ChpsFeta&Garlic 150g 
##                                     3138 
## Infuzions SourCream&Herbs Veg Strws 110g 
##                                     3134 
##    Kettle Sensations   Siracha Lime 150g 
##                                     3127 
## Old El Paso Salsa   Dip Chnky Tom Ht300g 
##                                     3125 
##        Doritos Corn Chips  Original 170g 
##                                     3121 
##                 Doritos Mexicana    170g 
##                                     3115 
##                 Twisties Cheese     270g 
##                                     3115 
##  Old El Paso Salsa   Dip Tomato Med 300g 
##                                     3114 
##         Pringles Mystery    Flavour 134g 
##                                     3114 
##         Thins Chips Seasonedchicken 175g 
##                                     3114 
##    Grain Waves Sour    Cream&Chives 210G 
##                                     3105 
##      Pringles Chicken    Salt Crips 134g 
##                                     3104 
##         Thins Chips Salt &  Vinegar 175g 
##                                     3103 
##                 Pringles Slt Vingar 134g 
##                                     3095 
## Old El Paso Salsa   Dip Tomato Mild 300g 
##                                     3085 
##       Kettle Sensations   BBQ&Maple 150g 
##                                     3083 
##         Pringles Sthrn FriedChicken 134g 
##                                     3083 
##          Tostitos Lightly    Salted 175g 
##                                     3074 
##         Doritos Cheese      Supreme 330g 
##                                     3052 
##                       Kettle Chilli 175g 
##                                     3038 
##    Smiths Chip Thinly  Cut Original 175g 
##                                     1614 
##     Snbts Whlgrn Crisps Cheddr&Mstrd 90g 
##                                     1576 
##  Natural Chip Co     Tmato Hrb&Spce 175g 
##                                     1572 
##                        Burger Rings 220g 
##                                     1564 
##   Natural ChipCo Sea  Salt & Vinegr 175g 
##                                     1550 
##                 CCs Tasty Cheese    175g 
##                                     1539 
##      RRD SR Slow Rst     Pork Belly 150g 
##                                     1526 
##   Smiths Thinly Cut   Roast Chicken 175g 
##                                     1519 
##      RRD Sweet Chilli &  Sour Cream 165g 
##                                     1516 
##           Woolworths Cheese   Rings 190g 
##                                     1516 
##                        CCs Original 175g 
##                                     1514 
##         RRD Honey Soy       Chicken 165g 
##                                     1513 
##    Smith Crinkle Cut   Mac N Cheese 150g 
##                                     1512 
##      WW Supreme Cheese   Corn Chips 200g 
##                                     1509 
##  Infuzions Mango     Chutny Papadums 70g 
##                                     1507 
##         RRD Chilli&         Coconut 150g 
##                                     1506 
##      Smiths Crinkle Cut  Snag&Sauce 150g 
##                                     1503 
##                 CCs Nacho Cheese    175g 
##                                     1498 
##  Red Rock Deli Sp    Salt & Truffle 150G 
##                                     1498 
##     Red Rock Deli Thai  Chilli&Lime 150g 
##                                     1495 
##           WW Original Corn    Chips 200g 
##                                     1495 
##           Woolworths Mild     Salsa 300g 
##                                     1491 
##  Smiths Crinkle Cut  Chips Barbecue 170g 
##                                     1489 
##           WW Original Stacked Chips 160g 
##                                     1487 
##   Smiths Crinkle Cut  Chips Chicken 170g 
##                                     1484 
##   WW Sour Cream &OnionStacked Chips 160g 
##                                     1483 
##  Smiths Crinkle Cut  Chips Chs&Onion170g 
##                                     1481 
##           Cheetos Chs & Bacon Balls 190g 
##                                     1479 
##                 RRD Salt & Vinegar  165g 
##                                     1474 
##                 RRD Lime & Pepper   165g 
##                                     1473 
##   Smiths Chip Thinly  S/Cream&Onion 175g 
##                                     1473 
##                 Doritos Salsa Mild  300g 
##                                     1472 
##    Smiths Crinkle Cut  Tomato Salsa 150g 
##                                     1470 
##        WW D/Style Chip     Sea Salt 200g 
##                                     1469 
##    GrnWves Plus Btroot & Chilli Jam 180g 
##                                     1468 
##   Natural Chip        Compny SeaSalt175g 
##                                     1468 
##         WW Crinkle Cut      Chicken 175g 
##                                     1467 
##  Smiths Crinkle Cut  Chips Original 170g 
##                                     1461 
## Smiths Thinly       Swt Chli&S/Cream175G 
##                                     1461 
##   Natural ChipCo      Hony Soy Chckn175g 
##                                     1460 
## Red Rock Deli SR    Salsa & Mzzrlla 150g 
##                                     1458 
##     RRD Steak &         Chimuchurri 150g 
##                                     1455 
##                                  (Other) 
##                                    21550
summary(trans)
##       DATE              STORE_NBR     LYLTY_CARD_NBR        TXN_ID       
##  Min.   :2018-07-01   Min.   :  1.0   Min.   :   1000   Min.   :      1  
##  1st Qu.:2018-09-30   1st Qu.: 70.0   1st Qu.:  70021   1st Qu.:  67602  
##  Median :2018-12-30   Median :130.0   Median : 130358   Median : 135138  
##  Mean   :2018-12-30   Mean   :135.1   Mean   : 135550   Mean   : 135158  
##  3rd Qu.:2019-03-31   3rd Qu.:203.0   3rd Qu.: 203094   3rd Qu.: 202701  
##  Max.   :2019-06-30   Max.   :272.0   Max.   :2373711   Max.   :2415841  
##                                                                          
##     PROD_NBR       PROD_NAME            PROD_QTY         TOT_SALES      
##  Min.   :  1.00   Length:264836      Min.   :  1.000   Min.   :  1.500  
##  1st Qu.: 28.00   Class :character   1st Qu.:  2.000   1st Qu.:  5.400  
##  Median : 56.00   Mode  :character   Median :  2.000   Median :  7.400  
##  Mean   : 56.58                      Mean   :  1.907   Mean   :  7.304  
##  3rd Qu.: 85.00                      3rd Qu.:  2.000   3rd Qu.:  9.200  
##  Max.   :114.00                      Max.   :200.000   Max.   :650.000  
##                                                                         
##                                  PROD_NAME_FACTOR 
##  Kettle Mozzarella   Basil & Pesto 175g  :  3304  
##  Kettle Tortilla ChpsHny&Jlpno Chili 150g:  3296  
##  Cobs Popd Swt/Chlli &Sr/Cream Chips 110g:  3269  
##  Tyrrells Crisps     Ched & Chives 165g  :  3268  
##  Cobs Popd Sea Salt  Chips 110g          :  3265  
##  Kettle 135g Swt Pot Sea Salt            :  3257  
##  (Other)                                 :245177
str(trans)
## tibble [264,836 × 9] (S3: tbl_df/tbl/data.frame)
##  $ DATE            : Date[1:264836], format: "2018-10-17" "2019-05-14" ...
##  $ STORE_NBR       : num [1:264836] 1 1 1 2 2 4 4 4 5 7 ...
##  $ LYLTY_CARD_NBR  : num [1:264836] 1000 1307 1343 2373 2426 ...
##  $ TXN_ID          : num [1:264836] 1 348 383 974 1038 ...
##  $ PROD_NBR        : num [1:264836] 5 66 61 69 108 57 16 24 42 52 ...
##  $ PROD_NAME       : chr [1:264836] "Natural Chip        Compny SeaSalt175g" "CCs Nacho Cheese    175g" "Smiths Crinkle Cut  Chips Chicken 170g" "Smiths Chip Thinly  S/Cream&Onion 175g" ...
##  $ PROD_QTY        : num [1:264836] 2 3 2 5 3 1 1 1 1 2 ...
##  $ TOT_SALES       : num [1:264836] 6 6.3 2.9 15 13.8 5.1 5.7 3.6 3.9 7.2 ...
##  $ PROD_NAME_FACTOR: Factor w/ 114 levels "Burger Rings 220g",..: 44 2 80 76 43 51 78 23 14 24 ...

Text analysis

Examine product words in PROD_NAME

library(data.table)
productWords <- data.table(unlist(strsplit(unique(trans$PROD_NAME), " ")))
setnames(productWords, 'words')
#productWords

Removing words that contain numerical

numerical.validation <- grepl('[1-9]',productWords[,words])
productWords <- productWords[numerical.validation==FALSE]
#productWords

Removing words that contain special character ‘&’

scAnd.validation <- grepl('&',productWords[,words])
productWords <- productWords[scAnd.validation==FALSE]

Removing words that contain special character ‘/’

sc.validation <- grepl('/',productWords[,words])
productWords <- productWords[sc.validation==FALSE]

Counting frequencies

#factorising words
productWords <- factor(productWords$words)

summary

summary(productWords)
##                   Chips      Smiths     Crinkle         Cut      Kettle 
##         234          21          16          14          14          13 
##      Cheese        Salt    Original        Chip     Doritos       Salsa 
##          12          12          10           9           9           9 
##        Corn    Pringles         RRD     Chicken          WW         Sea 
##           8           8           8           7           7           6 
##        Sour      Chilli      Crisps      Thinly       Thins     Vinegar 
##           6           5           5           5           5           5 
##       Cream        Deli   Infuzions     Natural         Red        Rock 
##           4           4           4           4           4           4 
##     Supreme         CCs        Cobs         Dip          El        Lime 
##           4           3           3           3           3           3 
##        Mild         Old        Paso        Popd  Sensations         Soy 
##           3           3           3           3           3           3 
##       Sweet      Tomato    Tortilla    Tostitos    Twisties  Woolworths 
##           3           3           3           3           3           3 
##         And         BBQ      Burger     Cheetos    Cheezels      ChipCo 
##           2           2           2           2           2           2 
##      Chives      French       Grain       Honey     Lightly      Medium 
##           2           2           2           2           2           2 
##       Nacho      Potato       Rings      Salted       Smith          SR 
##           2           2           2           2           2           2 
##         Swt       Tangy        Thai    Tyrrells       Waves       Aioli 
##           2           2           2           2           2           1 
##       Bacon         Bag       Balls    Barbecue    Barbeque       Basil 
##           1           1           1           1           1           1 
##       Belly         Big   Bolognese         Box      Btroot   Camembert 
##           1           1           1           1           1           1 
##        Ched       Chili Chimuchurri    Chipotle       Chnky         Chp 
##           1           1           1           1           1           1 
##         Chs      Chutny          Co     Coconut      Compny    Crackers 
##           1           1           1           1           1           1 
##       Crips         Crm         Crn     (Other) 
##           1           1           1          70
trans <- data.table(trans)

Remove salsa products

# Remove salsa products
trans[, SALSA := grepl("salsa", tolower(PROD_NAME))]
trans <- trans[SALSA == FALSE, ][, SALSA := NULL]

Summarise the data to check for nulls and possible outliers

summary(trans)
##       DATE              STORE_NBR     LYLTY_CARD_NBR        TXN_ID       
##  Min.   :2018-07-01   Min.   :  1.0   Min.   :   1000   Min.   :      1  
##  1st Qu.:2018-09-30   1st Qu.: 70.0   1st Qu.:  70015   1st Qu.:  67569  
##  Median :2018-12-30   Median :130.0   Median : 130367   Median : 135183  
##  Mean   :2018-12-30   Mean   :135.1   Mean   : 135531   Mean   : 135131  
##  3rd Qu.:2019-03-31   3rd Qu.:203.0   3rd Qu.: 203084   3rd Qu.: 202654  
##  Max.   :2019-06-30   Max.   :272.0   Max.   :2373711   Max.   :2415841  
##                                                                          
##     PROD_NBR       PROD_NAME            PROD_QTY         TOT_SALES      
##  Min.   :  1.00   Length:246742      Min.   :  1.000   Min.   :  1.700  
##  1st Qu.: 26.00   Class :character   1st Qu.:  2.000   1st Qu.:  5.800  
##  Median : 53.00   Mode  :character   Median :  2.000   Median :  7.400  
##  Mean   : 56.35                      Mean   :  1.908   Mean   :  7.321  
##  3rd Qu.: 87.00                      3rd Qu.:  2.000   3rd Qu.:  8.800  
##  Max.   :114.00                      Max.   :200.000   Max.   :650.000  
##                                                                         
##                                  PROD_NAME_FACTOR 
##  Kettle Mozzarella   Basil & Pesto 175g  :  3304  
##  Kettle Tortilla ChpsHny&Jlpno Chili 150g:  3296  
##  Cobs Popd Swt/Chlli &Sr/Cream Chips 110g:  3269  
##  Tyrrells Crisps     Ched & Chives 165g  :  3268  
##  Cobs Popd Sea Salt  Chips 110g          :  3265  
##  Kettle 135g Swt Pot Sea Salt            :  3257  
##  (Other)                                 :227083

Checking for outliers

By seeing summary of data, we can see that the maximum value of PROD_QTY is more that (3rd quartile + 1.5*IQR)

Lets confirm this with a boxplot

boxplot(trans$PROD_QTY)

Yes we can confirm existance of outliers.

library(dplyr)
filter(trans,trans$PROD_QTY==200)
##          DATE STORE_NBR LYLTY_CARD_NBR TXN_ID PROD_NBR
## 1: 2018-08-19       226         226000 226201        4
## 2: 2019-05-20       226         226000 226210        4
##                           PROD_NAME PROD_QTY TOT_SALES
## 1: Dorito Corn Chp     Supreme 380g      200       650
## 2: Dorito Corn Chp     Supreme 380g      200       650
##                    PROD_NAME_FACTOR
## 1: Dorito Corn Chp     Supreme 380g
## 2: Dorito Corn Chp     Supreme 380g

We have 2 records where the PROD_QTY is 200. Both are made by same customer 226000. Let’s see if he has any other transactions

filter(trans,trans$LYLTY_CARD_NBR==226000)
##          DATE STORE_NBR LYLTY_CARD_NBR TXN_ID PROD_NBR
## 1: 2018-08-19       226         226000 226201        4
## 2: 2019-05-20       226         226000 226210        4
##                           PROD_NAME PROD_QTY TOT_SALES
## 1: Dorito Corn Chp     Supreme 380g      200       650
## 2: Dorito Corn Chp     Supreme 380g      200       650
##                    PROD_NAME_FACTOR
## 1: Dorito Corn Chp     Supreme 380g
## 2: Dorito Corn Chp     Supreme 380g

It looks like this customer has only had the two transactions over the year and is not an ordinary retail customer. The customer might be buying chips for commercial purposes instead. We’ll remove this loyalty card number from further analysis.

Filter out the customer based on the loyalty card number

Removing customer - 226000 from further analysis

#trans[,trans$LYLTY_CARD_NBR != 226000]
trans <- trans[trans[,trans$LYLTY_CARD_NBR != 226000]]

Re-examine transaction data

summary(trans)
##       DATE              STORE_NBR     LYLTY_CARD_NBR        TXN_ID       
##  Min.   :2018-07-01   Min.   :  1.0   Min.   :   1000   Min.   :      1  
##  1st Qu.:2018-09-30   1st Qu.: 70.0   1st Qu.:  70015   1st Qu.:  67569  
##  Median :2018-12-30   Median :130.0   Median : 130367   Median : 135182  
##  Mean   :2018-12-30   Mean   :135.1   Mean   : 135530   Mean   : 135130  
##  3rd Qu.:2019-03-31   3rd Qu.:203.0   3rd Qu.: 203083   3rd Qu.: 202652  
##  Max.   :2019-06-30   Max.   :272.0   Max.   :2373711   Max.   :2415841  
##                                                                          
##     PROD_NBR       PROD_NAME            PROD_QTY       TOT_SALES     
##  Min.   :  1.00   Length:246740      Min.   :1.000   Min.   : 1.700  
##  1st Qu.: 26.00   Class :character   1st Qu.:2.000   1st Qu.: 5.800  
##  Median : 53.00   Mode  :character   Median :2.000   Median : 7.400  
##  Mean   : 56.35                      Mean   :1.906   Mean   : 7.316  
##  3rd Qu.: 87.00                      3rd Qu.:2.000   3rd Qu.: 8.800  
##  Max.   :114.00                      Max.   :5.000   Max.   :29.500  
##                                                                      
##                                  PROD_NAME_FACTOR 
##  Kettle Mozzarella   Basil & Pesto 175g  :  3304  
##  Kettle Tortilla ChpsHny&Jlpno Chili 150g:  3296  
##  Cobs Popd Swt/Chlli &Sr/Cream Chips 110g:  3269  
##  Tyrrells Crisps     Ched & Chives 165g  :  3268  
##  Cobs Popd Sea Salt  Chips 110g          :  3265  
##  Kettle 135g Swt Pot Sea Salt            :  3257  
##  (Other)                                 :227081
boxplot(trans$PROD_QTY)

Count the number of transactions by date

Let us factorise the dates

trans$newDATE <- factor(trans$DATE)

Summary

summary(trans)
##       DATE              STORE_NBR     LYLTY_CARD_NBR        TXN_ID       
##  Min.   :2018-07-01   Min.   :  1.0   Min.   :   1000   Min.   :      1  
##  1st Qu.:2018-09-30   1st Qu.: 70.0   1st Qu.:  70015   1st Qu.:  67569  
##  Median :2018-12-30   Median :130.0   Median : 130367   Median : 135182  
##  Mean   :2018-12-30   Mean   :135.1   Mean   : 135530   Mean   : 135130  
##  3rd Qu.:2019-03-31   3rd Qu.:203.0   3rd Qu.: 203083   3rd Qu.: 202652  
##  Max.   :2019-06-30   Max.   :272.0   Max.   :2373711   Max.   :2415841  
##                                                                          
##     PROD_NBR       PROD_NAME            PROD_QTY       TOT_SALES     
##  Min.   :  1.00   Length:246740      Min.   :1.000   Min.   : 1.700  
##  1st Qu.: 26.00   Class :character   1st Qu.:2.000   1st Qu.: 5.800  
##  Median : 53.00   Mode  :character   Median :2.000   Median : 7.400  
##  Mean   : 56.35                      Mean   :1.906   Mean   : 7.316  
##  3rd Qu.: 87.00                      3rd Qu.:2.000   3rd Qu.: 8.800  
##  Max.   :114.00                      Max.   :5.000   Max.   :29.500  
##                                                                      
##                                  PROD_NAME_FACTOR        newDATE      
##  Kettle Mozzarella   Basil & Pesto 175g  :  3304   2018-12-24:   865  
##  Kettle Tortilla ChpsHny&Jlpno Chili 150g:  3296   2018-12-23:   853  
##  Cobs Popd Swt/Chlli &Sr/Cream Chips 110g:  3269   2018-12-22:   840  
##  Tyrrells Crisps     Ched & Chives 165g  :  3268   2018-12-19:   839  
##  Cobs Popd Sea Salt  Chips 110g          :  3265   2018-12-20:   808  
##  Kettle 135g Swt Pot Sea Salt            :  3257   2018-12-18:   799  
##  (Other)                                 :227081   (Other)   :241736
str(trans)
## Classes 'data.table' and 'data.frame':   246740 obs. of  10 variables:
##  $ DATE            : Date, format: "2018-10-17" "2019-05-14" ...
##  $ STORE_NBR       : num  1 1 1 2 2 4 4 5 7 7 ...
##  $ LYLTY_CARD_NBR  : num  1000 1307 1343 2373 2426 ...
##  $ TXN_ID          : num  1 348 383 974 1038 ...
##  $ PROD_NBR        : num  5 66 61 69 108 16 24 42 52 16 ...
##  $ PROD_NAME       : chr  "Natural Chip        Compny SeaSalt175g" "CCs Nacho Cheese    175g" "Smiths Crinkle Cut  Chips Chicken 170g" "Smiths Chip Thinly  S/Cream&Onion 175g" ...
##  $ PROD_QTY        : num  2 3 2 5 3 1 1 1 2 1 ...
##  $ TOT_SALES       : num  6 6.3 2.9 15 13.8 5.7 3.6 3.9 7.2 5.7 ...
##  $ PROD_NAME_FACTOR: Factor w/ 114 levels "Burger Rings 220g",..: 44 2 80 76 43 78 23 14 24 78 ...
##  $ newDATE         : Factor w/ 364 levels "2018-07-01","2018-07-02",..: 109 317 323 48 49 319 319 51 49 320 ...
##  - attr(*, ".internal.selfref")=<externalptr>

There are 364 unique dates where transaction happened. We will create a new column with dates from min to max i.e., 2018-07-01 to 2019-06-30 and then join this with trans to find that missing date.

model.date <- seq(as.Date("2018-07-01"),as.Date("2019-06-30"),by = 'day')
model.date <- data.table(model.date)
setnames(model.date,'DATE')
#colnames(model.date) <- c('Date')

joining

trans <- full_join(trans,model.date,by = c('DATE'))

summary

summary(trans)
##       DATE              STORE_NBR     LYLTY_CARD_NBR        TXN_ID       
##  Min.   :2018-07-01   Min.   :  1.0   Min.   :   1000   Min.   :      1  
##  1st Qu.:2018-09-30   1st Qu.: 70.0   1st Qu.:  70015   1st Qu.:  67569  
##  Median :2018-12-30   Median :130.0   Median : 130367   Median : 135182  
##  Mean   :2018-12-30   Mean   :135.1   Mean   : 135530   Mean   : 135130  
##  3rd Qu.:2019-03-31   3rd Qu.:203.0   3rd Qu.: 203083   3rd Qu.: 202652  
##  Max.   :2019-06-30   Max.   :272.0   Max.   :2373711   Max.   :2415841  
##                       NA's   :1       NA's   :1         NA's   :1        
##     PROD_NBR       PROD_NAME            PROD_QTY       TOT_SALES     
##  Min.   :  1.00   Length:246741      Min.   :1.000   Min.   : 1.700  
##  1st Qu.: 26.00   Class :character   1st Qu.:2.000   1st Qu.: 5.800  
##  Median : 53.00   Mode  :character   Median :2.000   Median : 7.400  
##  Mean   : 56.35                      Mean   :1.906   Mean   : 7.316  
##  3rd Qu.: 87.00                      3rd Qu.:2.000   3rd Qu.: 8.800  
##  Max.   :114.00                      Max.   :5.000   Max.   :29.500  
##  NA's   :1                           NA's   :1       NA's   :1       
##                                  PROD_NAME_FACTOR        newDATE      
##  Kettle Mozzarella   Basil & Pesto 175g  :  3304   2018-12-24:   865  
##  Kettle Tortilla ChpsHny&Jlpno Chili 150g:  3296   2018-12-23:   853  
##  Cobs Popd Swt/Chlli &Sr/Cream Chips 110g:  3269   2018-12-22:   840  
##  Tyrrells Crisps     Ched & Chives 165g  :  3268   2018-12-19:   839  
##  Cobs Popd Sea Salt  Chips 110g          :  3265   2018-12-20:   808  
##  (Other)                                 :230338   (Other)   :242535  
##  NA's                                    :     1   NA's      :     1

finding the date

filter(trans,is.na(trans$STORE_NBR) == TRUE)
##          DATE STORE_NBR LYLTY_CARD_NBR TXN_ID PROD_NBR PROD_NAME PROD_QTY
## 1: 2018-12-25        NA             NA     NA       NA      <NA>       NA
##    TOT_SALES PROD_NAME_FACTOR newDATE
## 1:        NA             <NA>    <NA>

We can see that the date 2018-12-25 is missing.

Count the number of transactions by date

transactions_per_date <- trans[, as.Date(trans$DATE, format = "%Y-%m-%d")]
transactions_per_date <- table(transactions_per_date)
transactions_per_date <- data.table(transactions_per_date)

Plot transactions over time

library(ggplot2)
#### Setting plot themes to format graphs
theme_set(theme_bw())
theme_update(plot.title = element_text(hjust = 0.5))
#### Plot transactions over time
ggplot(transactions_per_date, aes(x = as.Date(transactions_per_date), y = N)) +
 geom_line() +
 labs(x = "Day", y = "Number of transactions", title = "Transactions over time") +
 scale_x_date(breaks = "1 month") +
 theme(axis.text.x = element_text(angle = 90, vjust = 0.5))

Filter to December and look at individual days

We can see some anomaly in December. Creating December chart to further investigate.

x = subset(transactions_per_date, format.Date(transactions_per_date,"%m")=="12")
ggplot(x, aes(x = as.Date(transactions_per_date), y = N)) +
 geom_line() +
 labs(x = "Day", y = "Number of transactions", title = "Transactions over time") +
 scale_x_date(breaks = "1 day") +
 theme(axis.text.x = element_text(angle = 90, vjust = 0.5))

We can see that on Dec 25 we do not have any transaction. Because it was a missing value. Sales got increased until Christmas day and on Christmas day shops were closed.

# removing the Christmas day
trans <- subset(trans,trans$DATE != '2018-12-25')

Feature Engineering

Creating new features: Pack size

library(readr)
# creating new Pack size feature in trans by parasing our numbers from product names.
trans[, PACK_SIZE := parse_number(PROD_NAME)]
# Always check your output
# Let's check if the pack sizes look sensible
#.N is a spl variable in data.table used to represent # of observations in a group along with by = pack_size
trans[, .N, PACK_SIZE][order(PACK_SIZE)]
##     PACK_SIZE     N
##  1:        70  1507
##  2:        90  3008
##  3:       110 22387
##  4:       125  1454
##  5:       134 25102
##  6:       135  3257
##  7:       150 40203
##  8:       160  2970
##  9:       165 15297
## 10:       170 19983
## 11:       175 66390
## 12:       180  1468
## 13:       190  2995
## 14:       200  4473
## 15:       210  6272
## 16:       220  1564
## 17:       250  3169
## 18:       270  6285
## 19:       330 12540
## 20:       380  6416

The largest size is 380g and the smallest size is 70g - seems sensible!

Histogram of Pack size.

x1 <- trans$PACK_SIZE
x1 <- table(x1)
x1<- data.table(x1)
colnames(x1) <- c('Pack_size','Transactions')

barplot(height = x1$Transactions,
        names.arg = x1$Pack_size,
        main="Histogram of Pack_size to Transactions",
        xlab = "Pack size", 
        ylab= "# Transactions")

Creating new features: Brand_name

#Here we are parsing the first word of the sentence using word() from stringr
trans$Brand_name <- word(trans$PROD_NAME, 1)

#checking brands results
trans[, .N, Brand_name][order(Brand_name)]
##     Brand_name     N
##  1:     Burger  1564
##  2:        CCs  4551
##  3:    Cheetos  2927
##  4:   Cheezels  4603
##  5:       Cobs  9693
##  6:     Dorito  3183
##  7:    Doritos 22041
##  8:     French  1418
##  9:      Grain  6272
## 10:    GrnWves  1468
## 11:  Infuzions 11057
## 12:     Infzns  3144
## 13:     Kettle 41288
## 14:        NCC  1419
## 15:    Natural  6050
## 16:   Pringles 25102
## 17:        RRD 11894
## 18:        Red  4427
## 19:      Smith  2963
## 20:     Smiths 27390
## 21:      Snbts  1576
## 22:   Sunbites  1432
## 23:      Thins 14075
## 24:   Tostitos  9471
## 25:   Twisties  9454
## 26:   Tyrrells  6442
## 27:         WW 10320
## 28: Woolworths  1516
##     Brand_name     N

Clean brand names

Some of the brand names look like they are of the same brands - such as RED and RRD, NCC and Natural Chip Co, Smith and Smiths, infuzions and infzns, Snbts and Sunbites, WW and Woolworths, Dorito and Doritos, Grain and GrnWves Let’s combine these together.

#clean brand names
trans[Brand_name == "Red", Brand_name := "RRD"]
trans[Brand_name == "Dorito", Brand_name := "Doritos"]
trans[Brand_name == "GrnWves", Brand_name := "Grain Waves"]
trans[Brand_name == "Grain", Brand_name := "Grain Waves"]
trans[Brand_name == "Natural", Brand_name := "NCC"]
trans[Brand_name == "Smith", Brand_name := "Smiths"]
trans[Brand_name == "Infzns", Brand_name := "Infuzions"]
trans[Brand_name == "Snbts", Brand_name := "Sunbites"]
trans[Brand_name == "Woolworths", Brand_name := "WW"]
#checking brands results
trans[, .N, Brand_name][order(Brand_name)]
##      Brand_name     N
##  1:      Burger  1564
##  2:         CCs  4551
##  3:     Cheetos  2927
##  4:    Cheezels  4603
##  5:        Cobs  9693
##  6:     Doritos 25224
##  7:      French  1418
##  8: Grain Waves  7740
##  9:   Infuzions 14201
## 10:      Kettle 41288
## 11:         NCC  7469
## 12:    Pringles 25102
## 13:         RRD 16321
## 14:      Smiths 30353
## 15:    Sunbites  3008
## 16:       Thins 14075
## 17:    Tostitos  9471
## 18:    Twisties  9454
## 19:    Tyrrells  6442
## 20:          WW 11836

Loading dataset

cust = read.csv('QVI_purchase_behaviour.csv')

Examining customer data

summary(cust)
##  LYLTY_CARD_NBR     LIFESTAGE         PREMIUM_CUSTOMER  
##  Min.   :   1000   Length:72637       Length:72637      
##  1st Qu.:  66202   Class :character   Class :character  
##  Median : 134040   Mode  :character   Mode  :character  
##  Mean   : 136186                                        
##  3rd Qu.: 203375                                        
##  Max.   :2373711
str(cust)
## 'data.frame':    72637 obs. of  3 variables:
##  $ LYLTY_CARD_NBR  : int  1000 1002 1003 1004 1005 1007 1009 1010 1011 1012 ...
##  $ LIFESTAGE       : chr  "YOUNG SINGLES/COUPLES" "YOUNG SINGLES/COUPLES" "YOUNG FAMILIES" "OLDER SINGLES/COUPLES" ...
##  $ PREMIUM_CUSTOMER: chr  "Premium" "Mainstream" "Budget" "Mainstream" ...
#distribution of lifestage and premium_customer
ggplot(data = cust,aes(x = LIFESTAGE))+geom_histogram(stat = "count")
## Warning in geom_histogram(stat = "count"): Ignoring unknown parameters:
## `binwidth`, `bins`, and `pad`

ggplot(data = cust,aes(x = PREMIUM_CUSTOMER))+geom_histogram(stat = "count")
## Warning in geom_histogram(stat = "count"): Ignoring unknown parameters:
## `binwidth`, `bins`, and `pad`

Merge transaction data to customer data

#### Merge transaction data to customer data
# all.x = T implies full left join
df <- merge(trans, cust, all.x = TRUE)

#df$LYLTY_CARD_NBR <- as.factor(df$LYLTY_CARD_NBR)

As the number of rows in data is the same as that of transactionData, we can be sure that no duplicates were created. This is because we created data by setting all.x = TRUE (in other words, a left join) which means take all the rows in transactionData and find rows with matching values in shared columns and then joining the details in these rows to the x or the first mentioned table.

Checking if any transactions did not have a matched customer.

df <- df[!DATE == '2018-12-25']

Saving dataset

filePath <- "/Users/santosh/Documents/QuantiumDA/quantiumDA/"
fwrite(df, paste0(filePath,"QVI_data.csv"))

Data exploration is now complete!

Data analysis on customer segments

Total sales by LIFESTAGE and PREMIUM_CUSTOMER

ggplot(data = df,aes(x = LIFESTAGE,y = TOT_SALES, fill = PREMIUM_CUSTOMER)) + 
  geom_bar(stat = 'identity', position = "dodge") 

Sales are coming mainly from Budget - older families, Mainstream - young singles/couples, and Mainstream - retirees

same output another method ( creating separate df of the required data)

sales_summary <- df %>%
  group_by(LIFESTAGE, PREMIUM_CUSTOMER) %>%
  summarise(total_sales = sum(TOT_SALES))
## `summarise()` has grouped output by 'LIFESTAGE'. You can override using the
## `.groups` argument.
ggplot(sales_summary, aes(x = LIFESTAGE, y = total_sales, fill = PREMIUM_CUSTOMER)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "Total Sales by LIFESTAGE and PREMIUM_CUSTOMER") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Number of customers by LIFESTAGE and PREMIUM_CUSTOMER

cust_summary <- df %>% group_by(LIFESTAGE, PREMIUM_CUSTOMER) %>%
  summarise(cust_count = n())
## `summarise()` has grouped output by 'LIFESTAGE'. You can override using the
## `.groups` argument.
#n() is a function in dplyr that counts the number of observations in a group.
ggplot(cust_summary, aes(x = LIFESTAGE, y = cust_count, fill = PREMIUM_CUSTOMER)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "Number of Customers by LIFESTAGE and PREMIUM_CUSTOMER") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

There are more Mainstream - young singles/couples and Mainstream - retirees who buy chips. This contributes to there being more sales to these customer segments but this is not a major driver for the Budget - Older families segment.

Higher sales may also be driven by more units of chips being bought per customer. Let’s have a look at this next.

Average number of units per customer by LIFESTAGE and PREMIUM_CUSTOMER

avg_cust <- df %>% group_by(LIFESTAGE , PREMIUM_CUSTOMER ) %>% 
  summarize(avg_cust_count = mean(PROD_QTY))
## `summarise()` has grouped output by 'LIFESTAGE'. You can override using the
## `.groups` argument.
#### Average number of units per customer by LIFESTAGE and PREMIUM_CUSTOMER
ggplot(data = avg_cust,aes(x = LIFESTAGE,y = avg_cust_count,fill = PREMIUM_CUSTOMER)) +   geom_bar(stat = "identity", position = "dodge") +
  labs(title = "Average number of units per customer by LIFESTAGE and PREMIUM_CUSTOMER") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Older families and young families in general buy more chips per customer.

Let’s also investigate the average price per unit chips bought for each customer segment as this is also a driver of total sales.

Average price per unit by LIFESTAGE and PREMIUM_CUSTOMER

avg_ppu <- df %>% group_by(LIFESTAGE , PREMIUM_CUSTOMER) %>%
  summarise(avg_price = (sum(TOT_SALES)/sum(PROD_QTY)))
## `summarise()` has grouped output by 'LIFESTAGE'. You can override using the
## `.groups` argument.
#### Average price per unit by LIFESTAGE and PREMIUM_CUSTOMER
ggplot(data = avg_ppu,aes(x = LIFESTAGE,y = avg_price,fill = PREMIUM_CUSTOMER)) +   geom_bar(stat = "identity", position = "dodge") +
  labs(title = "Average price per unit by LIFESTAGE and PREMIUM_CUSTOMER") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Mainstream midage and young singles and couples are more willing to pay more per packet of chips compared to their budget and premium counterparts. This may be due to premium shoppers being more likely to buy healthy snacks and when they buy chips, this is mainly for entertainment purposes rather than their own consumption. This is also supported by there being fewer premium midage and young singles and couples buying chips compared to their mainstream counterparts.

Perform an independent t-test

Here we are performing t-test between Mainstream vs (Premium or Budget) for Midage and young - singles and couples.

#### Perform an independent t-test between mainstream vs premium and budget midage and young singles and couples
# Over to you! Perform a t-test to see if the difference is significant.
main_premium <- subset(avg_ppu, PREMIUM_CUSTOMER %in% c("Mainstream", "Premium")&LIFESTAGE %in% c("YOUNG SINGLES/COUPLES","MIDAGE SINGLES/COUPLES"))

t.test(avg_price~PREMIUM_CUSTOMER, data = main_premium)
## 
##  Welch Two Sample t-test
## 
## data:  avg_price by PREMIUM_CUSTOMER
## t = 5.0147, df = 1.9805, p-value = 0.0383
## alternative hypothesis: true difference in means between group Mainstream and group Premium is not equal to 0
## 95 percent confidence interval:
##  0.03980094 0.55497920
## sample estimates:
## mean in group Mainstream    mean in group Premium 
##                 4.034246                 3.736856
#### Perform an independent t-test between mainstream vs premium and budget midage and young singles and couples
# Over to you! Perform a t-test to see if the difference is significant.
main_budget <- subset(avg_ppu, PREMIUM_CUSTOMER %in% c("Mainstream","Budget")&LIFESTAGE %in% c("YOUNG SINGLES/COUPLES","MIDAGE SINGLES/COUPLES"))

t.test(avg_price~PREMIUM_CUSTOMER, data = main_budget)
## 
##  Welch Two Sample t-test
## 
## data:  avg_price by PREMIUM_CUSTOMER
## t = -5.9898, df = 1.9572, p-value = 0.02815
## alternative hypothesis: true difference in means between group Budget and group Mainstream is not equal to 0
## 95 percent confidence interval:
##  -0.5454862 -0.0838308
## sample estimates:
##     mean in group Budget mean in group Mainstream 
##                 3.719587                 4.034246

Both the t-test results suggest that p-value is less than \(\alpha = 0.05\) and we accept alternate hypothesis that there is some difference between mean between the groups mainstream vs premium or budget.

the unit price for mainstream, young and mid-age singles and couples are significantly higher than that of budget or premium, young and midage singles and couples.

Deep dive into Mainstream, young singles/couples

We might want to target customer segments that contribute the most to sales to retain them or further increase sales. Let’s look at Mainstream - young singles/couples. For instance, let’s find out if they tend to buy a particular brand of chips.

#get brand baskets
Baskets <- df %>% group_by(PREMIUM_CUSTOMER == "Mainstream" & LIFESTAGE == "YOUNG SINGLES/COUPLES") %>%
  summarise(basket = list(Brand_name))

str(Baskets)
## tibble [2 × 2] (S3: tbl_df/tbl/data.frame)
##  $ PREMIUM_CUSTOMER == "Mainstream" & ...: logi [1:2] FALSE TRUE
##  $ basket                                :List of 2
##   ..$ : chr [1:227196] "NCC" "Grain Waves" "NCC" "WW" ...
##   ..$ : chr [1:19544] "RRD" "Doritos" "Kettle" "RRD" ...
#compute transactions
transx <- as(Baskets$basket,"transactions")
## Warning in asMethod(object): removing duplicated items in transactions
# Perform affinity analysis using Apriori algorithm
rules <- apriori(transx, parameter = list(supp = 0.5, conf = 0.9, target = "rules"))
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.9    0.1    1 none FALSE            TRUE       5     0.5      1
##  maxlen target  ext
##      10  rules TRUE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 1 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[20 item(s), 2 transaction(s)] done [0.00s].
## sorting and recoding items ... [20 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 5 6 7 8 9 10
## Warning in apriori(transx, parameter = list(supp = 0.5, conf = 0.9, target =
## "rules")): Mining stopped (maxlen reached). Only patterns up to a length of 10
## returned!
##  done [0.08s].
## writing ... [5242880 rule(s)] done [0.30s].
## creating S4 object  ... done [1.10s].
#summary of rules
summary(rules)
## set of 5242880 rules
## 
## rule length distribution (lhs + rhs):sizes
##       1       2       3       4       5       6       7       8       9      10 
##      20     380    3420   19380   77520  232560  542640 1007760 1511640 1847560 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   8.000   9.000   8.738  10.000  10.000 
## 
## summary of quality measures:
##     support    confidence    coverage      lift       count  
##  Min.   :1   Min.   :1    Min.   :1   Min.   :1   Min.   :2  
##  1st Qu.:1   1st Qu.:1    1st Qu.:1   1st Qu.:1   1st Qu.:2  
##  Median :1   Median :1    Median :1   Median :1   Median :2  
##  Mean   :1   Mean   :1    Mean   :1   Mean   :1   Mean   :2  
##  3rd Qu.:1   3rd Qu.:1    3rd Qu.:1   3rd Qu.:1   3rd Qu.:2  
##  Max.   :1   Max.   :1    Max.   :1   Max.   :1   Max.   :2  
## 
## mining info:
##    data ntransactions support confidence
##  transx             2     0.5        0.9
##                                                                                call
##  apriori(data = transx, parameter = list(supp = 0.5, conf = 0.9, target = "rules"))
#Inspect rules with the highest lift.

inspect(head(sort(rules, by = "lift")))
##     lhs    rhs        support confidence coverage lift count
## [1] {}  => {Burger}   1       1          1        1    2    
## [2] {}  => {Tyrrells} 1       1          1        1    2    
## [3] {}  => {Twisties} 1       1          1        1    2    
## [4] {}  => {Tostitos} 1       1          1        1    2    
## [5] {}  => {Thins}    1       1          1        1    2    
## [6] {}  => {Sunbites} 1       1          1        1    2
# Visualize the rules
plot(rules, method = "graph")
## Warning: Too many rules supplied. Only plotting the best 100 using
## 'lift' (change control parameter max if needed).

rules1 <- apriori(transx, parameter = list(supp = 0.1, conf = 0.9, target = "rules"))
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.9    0.1    1 none FALSE            TRUE       5     0.1      1
##  maxlen target  ext
##      10  rules TRUE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 0 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[20 item(s), 2 transaction(s)] done [0.00s].
## sorting and recoding items ... [20 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 5 6 7 8 9 10
## Warning in apriori(transx, parameter = list(supp = 0.1, conf = 0.9, target =
## "rules")): Mining stopped (maxlen reached). Only patterns up to a length of 10
## returned!
##  done [0.07s].
## writing ... [5242880 rule(s)] done [0.28s].
## creating S4 object  ... done [0.89s].
#summary of rules
summary(rules1)
## set of 5242880 rules
## 
## rule length distribution (lhs + rhs):sizes
##       1       2       3       4       5       6       7       8       9      10 
##      20     380    3420   19380   77520  232560  542640 1007760 1511640 1847560 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   8.000   9.000   8.738  10.000  10.000 
## 
## summary of quality measures:
##     support    confidence    coverage      lift       count  
##  Min.   :1   Min.   :1    Min.   :1   Min.   :1   Min.   :2  
##  1st Qu.:1   1st Qu.:1    1st Qu.:1   1st Qu.:1   1st Qu.:2  
##  Median :1   Median :1    Median :1   Median :1   Median :2  
##  Mean   :1   Mean   :1    Mean   :1   Mean   :1   Mean   :2  
##  3rd Qu.:1   3rd Qu.:1    3rd Qu.:1   3rd Qu.:1   3rd Qu.:2  
##  Max.   :1   Max.   :1    Max.   :1   Max.   :1   Max.   :2  
## 
## mining info:
##    data ntransactions support confidence
##  transx             2     0.1        0.9
##                                                                                call
##  apriori(data = transx, parameter = list(supp = 0.1, conf = 0.9, target = "rules"))
#Inspect rules with the highest lift.

inspect(head(sort(rules1, by = "lift")))
##     lhs    rhs        support confidence coverage lift count
## [1] {}  => {Burger}   1       1          1        1    2    
## [2] {}  => {Tyrrells} 1       1          1        1    2    
## [3] {}  => {Twisties} 1       1          1        1    2    
## [4] {}  => {Tostitos} 1       1          1        1    2    
## [5] {}  => {Thins}    1       1          1        1    2    
## [6] {}  => {Sunbites} 1       1          1        1    2
# Visualize the rules
plot(rules1, method = "graph")
## Warning: Too many rules supplied. Only plotting the best 100 using
## 'lift' (change control parameter max if needed).

rules2 <- apriori(transx, parameter = list(supp = 0.9, conf = 0.9, target = "rules"))
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.9    0.1    1 none FALSE            TRUE       5     0.9      1
##  maxlen target  ext
##      10  rules TRUE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 1 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[20 item(s), 2 transaction(s)] done [0.00s].
## sorting and recoding items ... [20 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 5 6 7 8 9 10
## Warning in apriori(transx, parameter = list(supp = 0.9, conf = 0.9, target =
## "rules")): Mining stopped (maxlen reached). Only patterns up to a length of 10
## returned!
##  done [0.07s].
## writing ... [5242880 rule(s)] done [0.29s].
## creating S4 object  ... done [0.80s].
#summary of rules
summary(rules2)
## set of 5242880 rules
## 
## rule length distribution (lhs + rhs):sizes
##       1       2       3       4       5       6       7       8       9      10 
##      20     380    3420   19380   77520  232560  542640 1007760 1511640 1847560 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   8.000   9.000   8.738  10.000  10.000 
## 
## summary of quality measures:
##     support    confidence    coverage      lift       count  
##  Min.   :1   Min.   :1    Min.   :1   Min.   :1   Min.   :2  
##  1st Qu.:1   1st Qu.:1    1st Qu.:1   1st Qu.:1   1st Qu.:2  
##  Median :1   Median :1    Median :1   Median :1   Median :2  
##  Mean   :1   Mean   :1    Mean   :1   Mean   :1   Mean   :2  
##  3rd Qu.:1   3rd Qu.:1    3rd Qu.:1   3rd Qu.:1   3rd Qu.:2  
##  Max.   :1   Max.   :1    Max.   :1   Max.   :1   Max.   :2  
## 
## mining info:
##    data ntransactions support confidence
##  transx             2     0.9        0.9
##                                                                                call
##  apriori(data = transx, parameter = list(supp = 0.9, conf = 0.9, target = "rules"))
#Inspect rules with the highest lift.

inspect(head(sort(rules2, by = "lift")))
##     lhs    rhs        support confidence coverage lift count
## [1] {}  => {Burger}   1       1          1        1    2    
## [2] {}  => {Tyrrells} 1       1          1        1    2    
## [3] {}  => {Twisties} 1       1          1        1    2    
## [4] {}  => {Tostitos} 1       1          1        1    2    
## [5] {}  => {Thins}    1       1          1        1    2    
## [6] {}  => {Sunbites} 1       1          1        1    2
# Visualize the rules
plot(rules2, method = "graph")
## Warning: Too many rules supplied. Only plotting the best 100 using
## 'lift' (change control parameter max if needed).

rules21 <- apriori(transx, parameter = list(supp = 0.9, conf = 0.1, target = "rules"))
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.1    0.1    1 none FALSE            TRUE       5     0.9      1
##  maxlen target  ext
##      10  rules TRUE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 1 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[20 item(s), 2 transaction(s)] done [0.00s].
## sorting and recoding items ... [20 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 5 6 7 8 9 10
## Warning in apriori(transx, parameter = list(supp = 0.9, conf = 0.1, target =
## "rules")): Mining stopped (maxlen reached). Only patterns up to a length of 10
## returned!
##  done [0.07s].
## writing ... [5242880 rule(s)] done [0.28s].
## creating S4 object  ... done [0.78s].
#summary of rules
summary(rules21)
## set of 5242880 rules
## 
## rule length distribution (lhs + rhs):sizes
##       1       2       3       4       5       6       7       8       9      10 
##      20     380    3420   19380   77520  232560  542640 1007760 1511640 1847560 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   8.000   9.000   8.738  10.000  10.000 
## 
## summary of quality measures:
##     support    confidence    coverage      lift       count  
##  Min.   :1   Min.   :1    Min.   :1   Min.   :1   Min.   :2  
##  1st Qu.:1   1st Qu.:1    1st Qu.:1   1st Qu.:1   1st Qu.:2  
##  Median :1   Median :1    Median :1   Median :1   Median :2  
##  Mean   :1   Mean   :1    Mean   :1   Mean   :1   Mean   :2  
##  3rd Qu.:1   3rd Qu.:1    3rd Qu.:1   3rd Qu.:1   3rd Qu.:2  
##  Max.   :1   Max.   :1    Max.   :1   Max.   :1   Max.   :2  
## 
## mining info:
##    data ntransactions support confidence
##  transx             2     0.9        0.1
##                                                                                call
##  apriori(data = transx, parameter = list(supp = 0.9, conf = 0.1, target = "rules"))
#Inspect rules with the highest lift.

inspect(head(sort(rules21, by = "lift")))
##     lhs    rhs        support confidence coverage lift count
## [1] {}  => {Burger}   1       1          1        1    2    
## [2] {}  => {Tyrrells} 1       1          1        1    2    
## [3] {}  => {Twisties} 1       1          1        1    2    
## [4] {}  => {Tostitos} 1       1          1        1    2    
## [5] {}  => {Thins}    1       1          1        1    2    
## [6] {}  => {Sunbites} 1       1          1        1    2
# Visualize the rules
plot(rules21, method = "graph")
## Warning: Too many rules supplied. Only plotting the best 100 using
## 'lift' (change control parameter max if needed).

rules22 <- apriori(transx, parameter = list(supp = 0.1, conf = 0.1, target = "rules"))
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.1    0.1    1 none FALSE            TRUE       5     0.1      1
##  maxlen target  ext
##      10  rules TRUE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 0 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[20 item(s), 2 transaction(s)] done [0.00s].
## sorting and recoding items ... [20 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 5 6 7 8 9 10
## Warning in apriori(transx, parameter = list(supp = 0.1, conf = 0.1, target =
## "rules")): Mining stopped (maxlen reached). Only patterns up to a length of 10
## returned!
##  done [0.07s].
## writing ... [5242880 rule(s)] done [0.28s].
## creating S4 object  ... done [0.71s].
#summary of rules
summary(rules22)
## set of 5242880 rules
## 
## rule length distribution (lhs + rhs):sizes
##       1       2       3       4       5       6       7       8       9      10 
##      20     380    3420   19380   77520  232560  542640 1007760 1511640 1847560 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   8.000   9.000   8.738  10.000  10.000 
## 
## summary of quality measures:
##     support    confidence    coverage      lift       count  
##  Min.   :1   Min.   :1    Min.   :1   Min.   :1   Min.   :2  
##  1st Qu.:1   1st Qu.:1    1st Qu.:1   1st Qu.:1   1st Qu.:2  
##  Median :1   Median :1    Median :1   Median :1   Median :2  
##  Mean   :1   Mean   :1    Mean   :1   Mean   :1   Mean   :2  
##  3rd Qu.:1   3rd Qu.:1    3rd Qu.:1   3rd Qu.:1   3rd Qu.:2  
##  Max.   :1   Max.   :1    Max.   :1   Max.   :1   Max.   :2  
## 
## mining info:
##    data ntransactions support confidence
##  transx             2     0.1        0.1
##                                                                                call
##  apriori(data = transx, parameter = list(supp = 0.1, conf = 0.1, target = "rules"))
#Inspect rules with the highest lift.

inspect(head(sort(rules22, by = "lift")))
##     lhs    rhs        support confidence coverage lift count
## [1] {}  => {Burger}   1       1          1        1    2    
## [2] {}  => {Tyrrells} 1       1          1        1    2    
## [3] {}  => {Twisties} 1       1          1        1    2    
## [4] {}  => {Tostitos} 1       1          1        1    2    
## [5] {}  => {Thins}    1       1          1        1    2    
## [6] {}  => {Sunbites} 1       1          1        1    2
# Visualize the rules
plot(rules22, method = "graph")
## Warning: Too many rules supplied. Only plotting the best 100 using
## 'lift' (change control parameter max if needed).

rules221 <- apriori(transx, parameter = list(supp = 0.5, conf = 0.1, target = "rules"))
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.1    0.1    1 none FALSE            TRUE       5     0.5      1
##  maxlen target  ext
##      10  rules TRUE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 1 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[20 item(s), 2 transaction(s)] done [0.00s].
## sorting and recoding items ... [20 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 5 6 7 8 9 10
## Warning in apriori(transx, parameter = list(supp = 0.5, conf = 0.1, target =
## "rules")): Mining stopped (maxlen reached). Only patterns up to a length of 10
## returned!
##  done [0.07s].
## writing ... [5242880 rule(s)] done [0.29s].
## creating S4 object  ... done [0.72s].
#summary of rules
summary(rules221)
## set of 5242880 rules
## 
## rule length distribution (lhs + rhs):sizes
##       1       2       3       4       5       6       7       8       9      10 
##      20     380    3420   19380   77520  232560  542640 1007760 1511640 1847560 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   8.000   9.000   8.738  10.000  10.000 
## 
## summary of quality measures:
##     support    confidence    coverage      lift       count  
##  Min.   :1   Min.   :1    Min.   :1   Min.   :1   Min.   :2  
##  1st Qu.:1   1st Qu.:1    1st Qu.:1   1st Qu.:1   1st Qu.:2  
##  Median :1   Median :1    Median :1   Median :1   Median :2  
##  Mean   :1   Mean   :1    Mean   :1   Mean   :1   Mean   :2  
##  3rd Qu.:1   3rd Qu.:1    3rd Qu.:1   3rd Qu.:1   3rd Qu.:2  
##  Max.   :1   Max.   :1    Max.   :1   Max.   :1   Max.   :2  
## 
## mining info:
##    data ntransactions support confidence
##  transx             2     0.5        0.1
##                                                                                call
##  apriori(data = transx, parameter = list(supp = 0.5, conf = 0.1, target = "rules"))
#Inspect rules with the highest lift.

inspect(head(sort(rules221, by = "lift")))
##     lhs    rhs        support confidence coverage lift count
## [1] {}  => {Burger}   1       1          1        1    2    
## [2] {}  => {Tyrrells} 1       1          1        1    2    
## [3] {}  => {Twisties} 1       1          1        1    2    
## [4] {}  => {Tostitos} 1       1          1        1    2    
## [5] {}  => {Thins}    1       1          1        1    2    
## [6] {}  => {Sunbites} 1       1          1        1    2
# Visualize the rules
plot(rules221, method = "graph")
## Warning: Too many rules supplied. Only plotting the best 100 using
## 'lift' (change control parameter max if needed).

#### Deep dive into Mainstream, young singles/couples